#############################################################################
#                                                                           #
#  Enhancing Sensitivity Diagnostics for Qualitative Comparative Analysis:  #
#  A Combinatorial Approach                                                 #
#                                                                           #
#  Alrik Thiem, Reto Sp�hel, Adrian Dusa                                    #
#                                                                           #
#  14 October 2015                                                          #
#                                                                           #
#############################################################################

# load packages
#--------------
library("QCA")
library("lattice")

# set up data set with conditions/outcome sets
# Hug (2013, 258); ISO country codes
#---------------------------------------------
(dat <- data.frame(matrix(c(
   rep(1,25),rep(0,20),rep(c(0,0,1,0,0),3),
   0,0,0,1,0,0,1,0,0,0,0,rep(1,7),0,1),
   nrow = 16, byrow = TRUE, dimnames = list(c(
    "AT","DK","FI","NO","SE","AU","CA","FR",
    "US","DE","NL","CH","JP","NZ","IE","BE"),
   c("P", "U", "C", "S", "W"))
)))

# function implementing both combinatorial computation under DPA as well as IPA
#------------------------------------------------------------------------------

# mydata    : original data set (binary 0/1 variables only)
# outcome   : outcome
# type      : corruption/deletion
# assump    : assume dependent (DPA)/independent (IPA) perturbations
# n.cut     : number of cases (frequency) cut-off (beta)
# incl.cut  : inclusion cut-off (alpha)
# p.pert    : probability of perturbation under IPA (p)
# n.pert    : number of perturbations under DPA (D)

perturb <- function(mydata, outcome = c(""), type = "corruption", assump = "DPA",
                    n.cut = 1, incl.cut = 1, p.pert = 0.5, n.pert = 1) {
    names(mydata) <- toupper(names(mydata))
    conditions <- names(mydata)[which(names(mydata) != outcome)]
    outcome <- toupper(outcome)
    mydata <- mydata[, c(conditions, outcome)]
    udata <- unique(mydata[, conditions])
    rownames(udata) <- seq(nrow(udata))
    cpos <- cneg <- rep(0, nrow(udata))
    for (i in seq(nrow(udata))) {
        for (j in seq(nrow(mydata))) {
            if (all(udata[i, ] == mydata[j, conditions])) {
                if (mydata[j, outcome] == 1) {
                    cpos[i] <- cpos[i] + 1
                }
                else if (mydata[j, outcome] == 0) {
                    cneg[i] <- cneg[i] + 1
                }
            }
        }
    }
    total <- cpos + cneg
    udata <- udata[total >= n.cut, , drop = FALSE]
    cpos <- cpos[total >= n.cut]
    cneg <- cneg[total >= n.cut]
    total <- total[total >= n.cut]
    calculatePairs <- function(x, n.pert, type = "deletion") {
        pairsxl <- combn(nrow(udata), min(x, nrow(udata)))
        nofsetsxl <- 0
        for (j in seq(ncol(pairsxl))) {
            cposneg <- NULL
            for (l in seq(min(x, nrow(pairsxl)))) {
                cposneg <- c(cposneg, cbind(cpos, cneg)[pairsxl[l, j], ])
            }
            allpairs <- createMatrix(cposneg + 2) - 1
            allpairs <- allpairs[apply(allpairs, 1, function(y) all(y >= 0)), , drop = FALSE]
            linesubset <- rep(TRUE, nrow(allpairs))
            for (l in seq(min(x, nrow(pairsxl)))) {
                linesubset <- linesubset & rowSums(allpairs[, l*2 - c(1, 0)]) >= 1
            }
            allpairs <- allpairs[linesubset & rowSums(allpairs) <= n.pert, , drop = FALSE]
            for (i in seq(nrow(allpairs))) {
                lchanges <- rep(FALSE, min(x, nrow(pairsxl)))
                for (l in seq(min(x, nrow(pairsxl)))) {
                    initially <- cpos[pairsxl[l, j]]/total[pairsxl[l, j]]
                    if (type == "deletion") {
                        newtotaless <- total[pairsxl[l, j]] - allpairs[i, l*2 - 1]
                        after <- (cpos[pairsxl[l, j]] - allpairs[i, l*2 - 1])/newtotaless
                        lchanges[l] <- ((initially >= incl.cut & after <  incl.cut) | newtotaless <  n.cut) |
                                       ((initially <  incl.cut & after >= incl.cut) & newtotaless >= n.cut)
                    }
                    else if (type == "corruption") {
                        after <- (cpos[pairsxl[l, j]] + allpairs[i, l*2] - allpairs[i, l*2 - 1])/total[pairsxl[l, j]]
                        lchanges[l] <- (initially >= incl.cut & after <  incl.cut) |
                                       (initially <  incl.cut & after >= incl.cut)
                    }
                }
                if (all(lchanges)) {
                    combs <- 1
                    for (l in seq(min(x, nrow(pairsxl)))) {
                        combs <- combs*choose(cpos[pairsxl[l, j]], allpairs[i, l*2 - 1])
                        combs <- combs*choose(cneg[pairsxl[l, j]], allpairs[i, l*2])
                    }
                    nofsetsxl <- nofsetsxl + combs*choose(nrow(dat) - sum(cposneg), n.pert - sum(allpairs[i, ]))
                }
            }
        }
        return(nofsetsxl)
    }
    if (assump == "DPA") {
        nofsets <- 0
        totalsets <- choose(nrow(dat), n.pert)
        for (i in seq(n.pert)) {
            if (nofsets != totalsets) {
                nofsetsxl <- calculatePairs(i, n.pert, type = type)
                nofsets <- nofsets + ifelse(i %% 2 == 1, nofsetsxl, -1*nofsetsxl)
            }
        }
        return(as.vector(1 - nofsets/totalsets))
    }
    else if (assump == "IPA") {
        pfinal <- 1
        if (type == "deletion") {
            for (l in seq(nrow(udata))) {
                ptmp <- 1
                allpairs <- createMatrix(c(cpos[l], cneg[l]) + 2) - 1
                allpairs <- allpairs[apply(allpairs, 1, function(x) all(x >= 0)), , drop = FALSE]
                allpairs <- allpairs[rowSums(allpairs) >= 1, , drop = FALSE]
                for (i in seq(nrow(allpairs))) {
                    newtotaless <- total[l] - allpairs[i, 1] - allpairs[i, 2]
                    initially <- cpos[l]/total[l]
                    after <- (cpos[l] - allpairs[i, 1])/newtotaless
                    if (((initially >= incl.cut & after <  incl.cut) | newtotaless <  n.cut) |
                        ((initially <  incl.cut & after >= incl.cut) & newtotaless >= n.cut)) {
                           ptmp <- ptmp - dbinom(allpairs[i, 1], cpos[l], p.pert) * dbinom(allpairs[i, 2], cneg[l], p.pert)
                    }
                }
                pfinal <- pfinal*ptmp
            }
        }
        else if (type == "corruption") {
            for (l in seq(nrow(udata))) {
                ptmp <- 1
                allpairs <- createMatrix(c(cpos[l], cneg[l]) + 2) - 1
                allpairs <- allpairs[apply(allpairs, 1, function(x) all(x >= 0)), , drop = FALSE]
                allpairs <- allpairs[rowSums(allpairs) >= 1, , drop = FALSE]
                for (i in seq(nrow(allpairs))) {
                    initially <- cpos[l]/total[l]
                    after <- (cpos[l] - allpairs[i, 1] + allpairs[i, 2])/total[l]
                    if ((initially >= incl.cut & after < incl.cut) | (initially < incl.cut & after >= incl.cut)) {
                           ptmp <- ptmp - dbinom(allpairs[i, 1], cpos[l], p.pert) * dbinom(allpairs[i, 2], cneg[l], p.pert)
                    }
                }
                pfinal <- pfinal*ptmp
            }
        }
        return(pfinal)
    }
}


# figures for level plots (Figures 2a, 2b, 3a, 3b)
#-------------------------------------------------

# inclusion cut-offs
inc <- seq(0.50, 1, by = 0.01)
inc <- seq(0.50, 1, by = 0.1) # short-cut for n.deletions
# number / probability of perturbation
n.corruption <- 1:5
n.deletion <- 1:12
p.corruption <- seq(0.01, 0.3, 0.01)
p.deletion <- seq(0.01, 0.5, 0.01)
# thetas
th.n.corruption <- numeric(length(inc)*length(n.corruption))
th.n.deletion <- numeric(length(inc)*length(n.deletion))
th.p.corruption <- numeric(length(inc)*length(p.corruption))
th.p.deletion <- numeric(length(inc)*length(p.deletion))

# dependent corruption
l <- 0
for(j in min(n.corruption):max(n.corruption)){
  for(i in 1:length(inc)){
      l <- l + 1
       w.corruption <- perturb(dat, outcome = "W", type = "corruption", n.pert = j,
        incl.cut = inc[i], assump = "DPA")
       th.n.corruption[l] <- w.corruption
  }
}

# dependent deletions
l <- 0
for(j in min(n.deletion):max(n.deletion)){
  for(i in 1:length(inc)){
      l <- l + 1
       w.deletion <- perturb(dat, outcome = "W", type = "deletion", n.pert = j,
         incl.cut = inc[i], assump = "DPA")
       th.n.deletion[l] <- w.deletion
  }
}

# independent corruptions
l <- 0
for(j in 1:length(p.corruption)){
  for(i in 1:length(inc)){
      l <- l + 1
       w.corruption <- perturb(dat, outcome = "W", type = "corruption",
        incl.cut = inc[i], assump = "IPA", p.pert = p.corruption[j])
       th.p.corruption[l] <- w.corruption
  }
}

# independent deletions
l <- 0
for(j in 1:length(p.deletion)){
  for(i in 1:length(inc)){
      l <- l + 1
       w.deletion <- perturb(dat, outcome = "W", type = "deletion", incl.cut = inc[i],
        assump = "IPA", p.pert = p.deletion[j])
       th.p.deletion[l] <- w.deletion
  }
}


sim.res.n.corruption <- cbind(expand.grid(inc, n.corruption), th.n.corruption)
sim.res.n.deletion <- cbind(expand.grid(inc, n.deletion), th.n.deletion)
sim.res.p.corruption <- cbind(expand.grid(inc, p.corruption), th.p.corruption)
sim.res.p.deletion <- cbind(expand.grid(inc, p.deletion), th.p.deletion)

names(sim.res.n.corruption) <- c("inc", "n.corruption", "th.n.corruption")
names(sim.res.n.deletion) <- c("inc", "n.deletion", "th.n.deletion")
names(sim.res.p.corruption) <- c("inc", "p.corruption", "th.p.corruption")
names(sim.res.p.deletion) <- c("inc", "p.deletion", "th.p.deletion")

# plot results in level plots (Figures 2a, 2b, 3a, 3b)
#-----------------------------------------------------

windows(width = 4.5, height = 3.5)
levelplot(th.n.corruption ~ inc * n.corruption,
  data = sim.res.n.corruption,
  col.regions = rev(grey.colors(15, start = 0, end = 1)),
  colorkey = list(labels = list(labels = as.character(c(0.75, seq(0.6,0,-0.1))),
                  at = c(0.75, seq(0.6,0,-0.1)))),
  xlab = expression(paste("Inclusion cut-off (", alpha, ")")),
  ylab = expression(paste("Number of corruptions ( ", italic(D), ")")),
  at = seq(0, 0.75, 0.05),
  aspect = 1/1,
  scales = list(relation = "free",
                x = list(at = seq(0.5,1,0.1)),
                y = list(at = min(n.corruption):max(n.corruption),
                         labels = c("1","2","3","4","5"))
                )
)

windows(width = 4.5, height = 3.5)
levelplot(th.n.deletion ~ inc * n.deletion,
  data = sim.res.n.deletion,
  col.regions = rev(grey.colors(18, start = 0, end = 1)),
  colorkey = list(labels = list(labels = as.character(seq(0,0.9,0.1)),
                  at = seq(0,0.9,0.1))),
  xlab = expression(paste("Inclusion cut-off (", alpha, ")")),
  ylab = expression(paste("Number of deletions ( ", italic(D), ")")),
  at = seq(0, 0.9, 0.05),
  aspect = 1/1,
  scales = list(relation = "free",
                x = list(at = seq(0.5,1,0.1)),
                y = list(at = min(n.deletion):max(n.deletion),
                        labels = as.character(1:12))
                )
)

windows(width = 4.5, height = 3.5)
levelplot(th.p.corruption ~ inc * p.corruption,
  data = sim.res.p.corruption,
  col.regions = rev(grey.colors(20, start = 0, end = 1)),
  colorkey = list(tick.number = 11),
  xlab = expression(paste("Inclusion cut-off (", alpha, ")")),
  ylab = expression(paste("Probability of corruption ( ", italic(p), ")")),
  at = seq(0, 1, 0.05),
  aspect = 1/1,
  scales = list(relation = "free",
                x = list(at = seq(0.5,1,0.1)),
                y = list(at = c(0.01, seq(0.05, 0.25, 0.05), 0.3),
                         labels = as.character(c(0.01, seq(0.05, 0.25, 0.05), 0.3)))
                )
)


windows(width = 4.5, height = 3.5)
levelplot(th.p.deletion ~ inc * p.deletion,
  data = sim.res.p.deletion,
  col.regions = rev(grey.colors(20, start = 0, end = 1)),
  colorkey = list(tick.number = 11),
  xlab = expression(paste("Inclusion cut-off (", alpha, ")")),
  ylab = expression(paste("Probability of deletion ( ", italic(p), ")")),
  at = seq(0, 1, 0.05),
  aspect = 1/1,
  scales = list(relation = "free",
                x = list(at = seq(0.5,1,0.1)),
                y = list(at = c(0.01, seq(0.05, 0.5, 0.05)),
                         labels = as.character(c(0.01, seq(0.05, 0.5, 0.05))))
                )
)

# function integrating and extending Hug's original R functions
# (exhaustive enumeration)
perturbExhaust <- function(mydata, outcome = c(""), incl.cut = 1, n.pert = 1,
                           type = "corruption") {
    cbs <- combn(nrow(mydata), n.pert)
    solist <- rep(list(0), ncol(cbs))
    if (type == "corruption") {
        for (i in seq(length(solist))) {
            try(solist[[i]] <- with(mydata, {
                    mydata[cbs[, i], outcome] <- 1 - mydata[cbs[, i], outcome]
                    eqmcc(mydata, outcome = outcome,
                          incl.cut1 = incl.cut)$solution[[1]]
                }), silent = TRUE)
        }
    }
    else if (type == "deletion") {
        for (i in seq(length(solist))) {
            try(solist[[i]] <- eqmcc(mydata[-cbs[, i], ], outcome = outcome,
                                     incl.cut1 = incl.cut)$solution[[1]]
                , silent = TRUE)
        }
    }
    tsl <- table(unlist(lapply(solist, function(x) paste(x, collapse="+"))))
    return(matrix(c(seq(length(tsl)), tsl, prop.table(tsl)), ncol = 3,
                  dimnames = list(names(tsl), c("ID", "N", "Prob"))))
}

# performance tests (section 4.4)
#--------------------------------

corruptions <- 1:16
sink("exhaustive_corruption.csv")
cat("\"Corruptions\",\"Time\",\"Memory\"\n")
sink()
for (i in corruptions) {
    gc()
    memory <- memory.size()
    timetaken <- system.time(perturbExhaust(dat, "W", incl.cut = 0.8,
      n.pert = corruptions[i], type = "corruption"))[3]
    memory <- memory.size() - memory
    sink("exhaustive_corruption.csv", append = TRUE)
    cat(paste(corruptions[i], timetaken, memory, sep = ","), "\n", sep = "")
    sink()
}

probs <- seq(0, 1, by = 0.001)
sink("combinatorial_IPA_corruption.csv")
cat("\"Prob\",\"Time\",\"Mem\"\n")
sink()

for (i in seq(length(probs))) {
    gc()
    memory <- memory.size()
    timetaken <- system.time(perturb(dat, "W", incl.cut = 0.8, p.pert = probs[i],
      assump = "IPA", type = "corruption"))[3]
    memory <- memory.size() - memory
    sink("combinatorial_IPA_corruption.csv", append = TRUE)
    cat(paste(probs[i], timetaken, memory, sep = ","), "\n", sep = "")
    sink()
}

corruptions <- 1:16
sink("combinatorial_DPA_corruption.csv")
cat("\"Corruptions\",\"Time\",\"Result\"\n")
sink()

for (i in seq(length(corruptions))) {
    gc()
    memory <- memory.size()
    timetaken <- system.time(result <- perturb(dat, "W", incl.cut = 0.8,
      n.pert = corruptions[i], assump = "DPA", type = "corruption"))[3]
    memory <- memory.size() - memory
    sink("combinatorial_DPA_corruption.csv", append = TRUE)
    cat(paste(corruptions[i], timetaken, result, sep = ","), "\n", sep = "")
    sink()
}

# for statistics with respect to loss of data, replace "type = 'corruption'" by
# "type = 'deletion'"

# code for Figure 4
C.DPA.corruption <- read.csv("combinatorial_DPA_corruption.csv")
C.IPA.corruption <- read.csv("combinatorial_IPA_corruption.csv")
E.corruption <- read.csv("exhaustive_corruption.csv")

yl <- c(0.01,0.025,0.05,0.1,0.25,0.5,1,2.5,5,10,25,50,100,200)
windows(width = 7.5, height = 6.25)
par(oma = c(3,1,0,0))
plot(E.corruption$Corruptions, E.corruption$Time, log = "y", type = "n", yaxt = "n",
  xlim = c(0,16), ylim = c(0.01,200), xaxp = c(0,16,16), xlab = "", ylab = "")
segments(rep(-0.5,14), yl, rep(16.5,14), yl, lty = 2, col = grey(0.85))
axis(1, seq(0, 16, by = 0.1*16), labels = seq(0, 1, by = 0.1), line = 2.75)
axis(2, 0:170, at = c(0.01,0.1,1,10,100), las = 1, labels = c("0.01","0.1","1","10","100"),
  font = 2, cex.axis = 1.25)
axis(2, 0:170, at = c(0.025,0.05,0.25,0.5,2.5,5,25,50,200), las = 1,
  labels = c("0.025","0.05","0.25","0.5","2.5","5","25","50","200"))
points(E.corruption$Corruptions, E.corruption$Time, type = "s")
points(E.corruption$Corruptions, E.corruption$Time,
  pch = ifelse(E.corruption$Corruptions < 9, 16, 21), cex = 1.3,
  bg = ifelse(E.corruption$Corruptions < 9, "black", "white"))
points(C.DPA.corruption$Corruptions,C.DPA.corruption$Time, type = "s")
points(C.DPA.corruption$Corruptions,C.DPA.corruption$Time,
  pch = ifelse(C.DPA.corruption$Corruptions < 9, 15, 22), cex = 1.3,
  bg = ifelse(C.DPA.corruption$Corruptions < 9, "black", "white"))
points(C.IPA.corruption$Prob[seq(1,1001, by = 100)]*16,
  C.IPA.corruption$Time[seq(1,1001, by = 100)], type = "l")
points(C.IPA.corruption$Prob[seq(1,1001, by = 100)]*16,
  C.IPA.corruption$Time[seq(1,1001, by = 100)],
  pch = ifelse(E.corruption$Corruptions < 7, 17, 24), cex = 1.3,
  bg = ifelse(E.corruption$Corruptions < 7, "black", "white"))
mtext("Number of Corruptions / Probability of Corruption", 1, line = 5.5, at = 8)
mtext("Time to Completion (in seconds; logged scale)", 2, line = 3.5)
text(5, 30, labels = "Exhaustive Enumeration", pos = 4, cex = 1.2)
text(5, 1, labels = "Combinatorial Computation (DPA)", pos = 4, cex = 1.2)
text(5, 0.05, labels = "Combinatorial Computation (IPA)", pos = 4, cex = 1.2)


